In [10]:
from __future__ import division
import math
import random
import re
from collections import defaultdict, Counter
from ml import *  # provides split_data, accuracy, precision, recall, f1_score

In [2]:
# Two practical notes: multiplying many per-word probabilities underflows,
# so work with sums of log-probabilities; and use pseudo-counts (smoothing)
# so a rare (word, label) combination never gets probability exactly zero.

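A quick illustration of both points (added as a side note, not part of the original run): the product of a hundred small probabilities underflows to 0.0 in floating point while the sum of their logs stays representable, and a pseudo-count k gives an unseen (word, label) pair a small non-zero probability instead of exactly zero.

In [ ]:
# Illustrative only -- why we work in log space and smooth the counts.
probs = [1e-5] * 100

product = 1.0
for p in probs:
    product *= p
print product                            # 0.0 -- underflow
print sum(math.log(p) for p in probs)    # about -1151.3 -- still usable

# With pseudo-count k, a word never seen in spam still gets
# (0 + k) / (total_spams + 2k) > 0 rather than probability zero.
k, total_spams = 0.5, 100
print (0 + k) / (total_spams + 2 * k)    # roughly 0.005, not 0.0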
In [71]:
def drop_final_s(word):
    return re.sub("s$", "", word)

def tokenize(message, drop_s = False):
    message = message.lower()
    all_words = re.findall("[a-z0-9']+", message)
    if drop_s:
        all_words = [drop_final_s(w) for w in all_words]
    return set(all_words)

def count_words(training_set):
    """training set is (message, is_spam)"""
    
    counts = defaultdict(lambda: [0, 0])   # word -> [spam_count, non_spam_count]
    
    for message, is_spam in training_set: # for each message
        for word in tokenize(message, drop_s=False): # split into words
            counts[word][0 if is_spam else 1] += 1 # for the given word increment spam or not spam count
    
    return counts

def word_probabilities(counts, total_spams, total_non_spams, k = 0.5):
    """turn the word counts into smoothed P(word | spam) and P(word | not spam),
    using pseudo-count k so no probability is ever exactly 0 or 1"""
    return [(w,
           (spam + k) / (total_spams + 2 * k),
           (non_spam + k) / (total_non_spams + 2 * k))
           for w, (spam, non_spam) in counts.iteritems()]

def spam_probability(word_probs, message):
    message_words = tokenize(message)
    
    log_prob_if_spam = log_prob_if_not_spam = 0.0
    
    # iterate over every word in the vocabulary, not just those in the message
    for word, prob_if_spam, prob_if_not_spam in word_probs:
        if word in message_words:
            # the word appears: add the log probability of seeing it
            log_prob_if_spam += math.log(prob_if_spam)
            log_prob_if_not_spam += math.log(prob_if_not_spam)
        else:
            # the word is absent: add the log probability of *not* seeing it
            log_prob_if_spam += math.log(1.0 - prob_if_spam)
            log_prob_if_not_spam += math.log(1.0 - prob_if_not_spam)
    
    prob_if_spam = math.exp(log_prob_if_spam)
    prob_if_not_spam = math.exp(log_prob_if_not_spam)
    
    return prob_if_spam / (prob_if_spam + prob_if_not_spam)

class NaiveBayesClassifier:
    
    def __init__(self, k = 0.5):
        self.k = k
        self.word_probs = []
        
    def train(self, training_set):
        # count the spam and non-spam
        num_spams = len([is_spam
                        for message, is_spam in training_set
                        if is_spam])
        
        num_non_spams = len(training_set) - num_spams
        
        # run word pipeline
        word_counts = count_words(training_set)
        
        self.word_probs = word_probabilities(word_counts,
                                            num_spams,
                                            num_non_spams,
                                            self.k)
        
    def classify(self, message):
        return spam_probability(self.word_probs, message)

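Before testing on real data, a quick smoke test of the pipeline on a tiny hand-made training set (added here; the exact probabilities are not meaningful, only that spammy subjects should score higher than innocuous ones):

In [ ]:
# Smoke test with made-up messages; illustrative only.
toy_train = [("cheap viagra offer", True),
             ("meeting agenda for tomorrow", False),
             ("limited time offer just for you", True),
             ("lunch tomorrow?", False)]

toy_clf = NaiveBayesClassifier()
toy_clf.train(toy_train)

print toy_clf.classify("free offer")        # should lean toward spam (> 0.5)
print toy_clf.classify("agenda for lunch")  # should lean toward ham (< 0.5)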
Testing


In [72]:
import glob

path = r'.\spam\*\*'

data = []

for fn in glob.glob(path):
    is_spam = "ham" not in fn  # directories with "ham" in the name hold non-spam
    
    with open(fn, 'r') as f:
        for line in f:
            if line.startswith("Subject:"):
                subject = re.sub(r"^Subject: ", "", line).strip()
                data.append((subject, is_spam))

In [73]:
random.seed(0)
train_data, test_data = split_data(data, 0.75)

classifier = NaiveBayesClassifier()

classifier.train(train_data)

In [74]:
len(test_data)


Out[74]:
876

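split_data comes from the shared ml module (imported with * at the top) and is not shown in this notebook; a minimal sketch of the behaviour assumed here, namely that each example independently lands in the training set with probability prob:

In [ ]:
# Not the actual ml.split_data -- a sketch of the assumed behaviour.
def split_data_sketch(data, prob):
    """split data into two lists: roughly prob of it for training, the rest for testing"""
    results = [], []
    for row in data:
        results[0 if random.random() < prob else 1].append(row)
    return results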
In [75]:
classified = [(subject, is_spam, classifier.classify(subject))
             for subject, is_spam in test_data]

counts = Counter((is_spam, spam_probability > 0.5)   # (actual, predicted) pairs
                for _, is_spam, spam_probability in classified)

In [76]:
counts


Out[76]:
Counter({(False, False): 704, (True, True): 101, (True, False): 38, (False, True): 33})

In [77]:
tn = counts[(False,False)]  # ham correctly classified as ham
tp = counts[(True,True)]    # spam correctly classified as spam
fn = counts[(True,False)]   # spam missed (classified as ham)
fp = counts[(False,True)]   # ham wrongly flagged as spam

In [78]:
print accuracy(tp, fp, fn, tn)
print precision(tp, fp, fn, tn)
print recall(tp, fp, fn, tn)
print f1_score(tp, fp, fn, tn)


0.918949771689
0.753731343284
0.726618705036
0.739926739927

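accuracy, precision, recall and f1_score also come from the ml module; a minimal sketch of the standard definitions they are assumed to implement, taking the confusion-matrix counts in the same (tp, fp, fn, tn) order used above:

In [ ]:
# Not the actual ml implementations -- assumed standard definitions.
# (true division is in effect via the __future__ import at the top)
def accuracy_sketch(tp, fp, fn, tn):
    return (tp + tn) / (tp + fp + fn + tn)   # fraction of correct predictions

def precision_sketch(tp, fp, fn, tn):
    return tp / (tp + fp)                    # of messages flagged spam, how many were spam

def recall_sketch(tp, fp, fn, tn):
    return tp / (tp + fn)                    # of actual spam, how much was caught

def f1_score_sketch(tp, fp, fn, tn):
    p = precision_sketch(tp, fp, fn, tn)
    r = recall_sketch(tp, fp, fn, tn)
    return 2 * p * r / (p + r)               # harmonic mean of precision and recall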
In [51]:
# sort the test messages by predicted spam probability, ascending
classified.sort(key = lambda row: row[2])

In [52]:
# actual hams with the highest predicted spam probability
spammiest_hams = filter(lambda row: not row[1], classified)[-5:]

In [53]:
spammiest_hams


Out[53]:
[('Attn programmers: support offered [FLOSS-Sarai Initiative]',
  False,
  0.9756129605142009),
 ('2000+ year old Greek computer reinterpreted', False, 0.983535500810437),
 ('What to look for in your next smart phone (Tech Update)',
  False,
  0.989871920690335),
 ('[ILUG-Social] Re: Important - reenactor insurance needed',
  False,
  0.9995349057803377),
 ('[ILUG-Social] Re: Important - reenactor insurance needed',
  False,
  0.9995349057803377)]

In [54]:
# actual spams with the lowest predicted spam probability
hammiest_spams = filter(lambda row: row[1], classified)[:5]

In [55]:
hammiest_spams


Out[55]:
[('Re: girls', True, 0.0009525186158414719),
 ('Introducing Chase Platinum for Students with a 0% Introductory APR',
  True,
  0.0012566691211091483),
 ('.Message report from your contact page....//ytu855 rkq',
  True,
  0.0015109358288617229),
 ('Testing a system, please delete', True, 0.0026920538836874364),
 ('Never pay for the goodz again (8SimUgQ)', True, 0.00591162322193142)]

In [57]:
def p_spam_given_word(word_prob):
    """calculate p(spam | word) via Bayes's theorem,
    assuming equal prior probabilities of spam and non-spam"""
    
    word, prob_if_spam, prob_if_not_spam = word_prob
    return prob_if_spam / (prob_if_spam + prob_if_not_spam)


words = sorted(classifier.word_probs, key = p_spam_given_word)

In [58]:
words[-5:]   # words whose presence is most indicative of spam


Out[58]:
[('year', 0.028767123287671233, 0.00022893772893772894),
 ('sale', 0.031506849315068496, 0.00022893772893772894),
 ('rates', 0.031506849315068496, 0.00022893772893772894),
 ('systemworks', 0.036986301369863014, 0.00022893772893772894),
 ('money', 0.03972602739726028, 0.00022893772893772894)]

In [59]:
words[:5]    # words whose presence is most indicative of ham


Out[59]:
[('spambayes', 0.0013698630136986301, 0.04601648351648352),
 ('users', 0.0013698630136986301, 0.036401098901098904),
 ('razor', 0.0013698630136986301, 0.030906593406593408),
 ('zzzzteana', 0.0013698630136986301, 0.029075091575091576),
 ('sadev', 0.0013698630136986301, 0.026785714285714284)]

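One follow-up worth trying: tokenize already supports a drop_s flag (via drop_final_s) that is switched off above, so "rates"/"rate" and "sales"/"sale" are counted separately. A sketch of the idea; for a real experiment, count_words and spam_probability would also need to tokenize with drop_s=True so training and classification stay consistent.

In [ ]:
# Illustrative only: strip a trailing 's' so singular/plural forms share a count.
print tokenize("Cheap rates on sales", drop_s=True)
# -> a set containing 'cheap', 'rate', 'on', 'sale'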
In [ ]: